{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ingestion Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## You'll need to install the following libraries if they are not already installed:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pip install elasticsearch sentence-transformers pyyaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Python/3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import yaml\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Elasticsearch client setup using cloud configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_client_es():\n",
    "    \"\"\"\n",
    "    Initializes Elasticsearch client using cloud_id and api_key from config.yml\n",
    "    \"\"\"\n",
    "    with open(\"../config.yml\", \"r\") as file:\n",
    "        config = yaml.safe_load(file)\n",
    "    return Elasticsearch(cloud_id=config[\"cloud_id\"], api_key=config[\"api_key\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 2: Text Vectorization using SentenceTransformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text_vector(sentences):\n",
    "    \"\"\"\n",
    "    Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'.\n",
    "    \"\"\"\n",
    "    model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "    embeddings = model.encode(sentences)\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 3: Read JSON file containing the dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_json_file(file_path):\n",
    "    \"\"\"\n",
    "    Reads and loads the dataset from a JSON file.\n",
    "    \"\"\"\n",
    "    with open(file_path, \"r\") as file:\n",
    "        data = json.load(file)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 4: Chunk data for batch processing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_data(data, batch_size):\n",
    "    \"\"\"\n",
    "    Yields chunks of data in batch sizes for bulk indexing in Elasticsearch.\n",
    "    \"\"\"\n",
    "    for i in range(0, len(data), batch_size):\n",
    "        yield data[i : i + batch_size]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 5: Generate bulk actions for Elasticsearch indexing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_bulk_actions(index_name, data_batch):\n",
    "    \"\"\"\n",
    "    Generates bulk actions for Elasticsearch from data batches.\n",
    "    Adds 'description_embeddings' by encoding the 'description' field.\n",
    "    \"\"\"\n",
    "    for item in data_batch:\n",
    "        document_id = item[\"id\"]\n",
    "        item[\"description_embeddings\"] = get_text_vector(item[\"description\"])\n",
    "        yield {\"_index\": index_name, \"_id\": document_id, \"_source\": item}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 6: Indexing data in batches to Elasticsearch\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def index_data_in_batches(file_path, index_name, batch_size=100):\n",
    "    \"\"\"\n",
    "    Indexes data from the JSON file in batches using Elasticsearch helpers.bulk.\n",
    "    \"\"\"\n",
    "    data = read_json_file(file_path)\n",
    "\n",
    "    for batch in chunk_data(data, batch_size):\n",
    "        actions = generate_bulk_actions(index_name, batch)\n",
    "        success, failed = helpers.bulk(get_client_es(), actions)\n",
    "        print(f\"Batch indexed: {success} successful, {failed} failed\")\n",
    "\n",
    "\n",
    "# main execution block\n",
    "# if __name__ == '__main__':\n",
    "#     index_data_in_batches(\"../files/dataset/products.json\", \"products-catalog\", batch_size=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batch indexed: 100 successful, [] failed\n",
      "Batch indexed: 100 successful, [] failed\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mindex_data_in_batches\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m../files/dataset/products.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mproducts-catalog-2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[6], line 9\u001b[0m, in \u001b[0;36mindex_data_in_batches\u001b[0;34m(file_path, index_name, batch_size)\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m chunk_data(data, batch_size):\n\u001b[1;32m      8\u001b[0m     actions \u001b[38;5;241m=\u001b[39m generate_bulk_actions(index_name, batch)\n\u001b[0;32m----> 9\u001b[0m     success, failed \u001b[38;5;241m=\u001b[39m \u001b[43mhelpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbulk\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_client_es\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mactions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     10\u001b[0m     \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBatch indexed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msuccess\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m successful, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfailed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:531\u001b[0m, in \u001b[0;36mbulk\u001b[0;34m(client, actions, stats_only, ignore_status, *args, **kwargs)\u001b[0m\n\u001b[1;32m    529\u001b[0m \u001b[38;5;66;03m# make streaming_bulk yield successful results so we can count them\u001b[39;00m\n\u001b[1;32m    530\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myield_ok\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 531\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ok, item \u001b[38;5;129;01min\u001b[39;00m streaming_bulk(\n\u001b[1;32m    532\u001b[0m     client, actions, ignore_status\u001b[38;5;241m=\u001b[39mignore_status, span_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhelpers.bulk\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m    533\u001b[0m ):\n\u001b[1;32m    534\u001b[0m     \u001b[38;5;66;03m# go through request-response pairs and detect failures\u001b[39;00m\n\u001b[1;32m    535\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ok:\n\u001b[1;32m    536\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m stats_only:\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:427\u001b[0m, in \u001b[0;36mstreaming_bulk\u001b[0;34m(client, actions, chunk_size, max_chunk_bytes, raise_on_error, expand_action_callback, raise_on_exception, max_retries, initial_backoff, max_backoff, yield_ok, ignore_status, span_name, *args, **kwargs)\u001b[0m\n\u001b[1;32m    420\u001b[0m bulk_data: List[\n\u001b[1;32m    421\u001b[0m     Union[\n\u001b[1;32m    422\u001b[0m         Tuple[_TYPE_BULK_ACTION_HEADER],\n\u001b[1;32m    423\u001b[0m         Tuple[_TYPE_BULK_ACTION_HEADER, _TYPE_BULK_ACTION_BODY],\n\u001b[1;32m    424\u001b[0m     ]\n\u001b[1;32m    425\u001b[0m ]\n\u001b[1;32m    426\u001b[0m bulk_actions: List[\u001b[38;5;28mbytes\u001b[39m]\n\u001b[0;32m--> 427\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m bulk_data, bulk_actions \u001b[38;5;129;01min\u001b[39;00m _chunk_actions(\n\u001b[1;32m    428\u001b[0m     \u001b[38;5;28mmap\u001b[39m(expand_action_callback, actions),\n\u001b[1;32m    429\u001b[0m     chunk_size,\n\u001b[1;32m    430\u001b[0m     max_chunk_bytes,\n\u001b[1;32m    431\u001b[0m     serializer,\n\u001b[1;32m    432\u001b[0m ):\n\u001b[1;32m    433\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m attempt \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(max_retries \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m):\n\u001b[1;32m    434\u001b[0m         to_retry: List[\u001b[38;5;28mbytes\u001b[39m] \u001b[38;5;241m=\u001b[39m []\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/elasticsearch/helpers/actions.py:234\u001b[0m, in \u001b[0;36m_chunk_actions\u001b[0;34m(actions, chunk_size, max_chunk_bytes, serializer)\u001b[0m\n\u001b[1;32m    227\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    228\u001b[0m \u001b[38;5;124;03mSplit actions into chunks by number or size, serialize them into strings in\u001b[39;00m\n\u001b[1;32m    229\u001b[0m \u001b[38;5;124;03mthe process.\u001b[39;00m\n\u001b[1;32m    230\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    231\u001b[0m chunker \u001b[38;5;241m=\u001b[39m _ActionChunker(\n\u001b[1;32m    232\u001b[0m     chunk_size\u001b[38;5;241m=\u001b[39mchunk_size, max_chunk_bytes\u001b[38;5;241m=\u001b[39mmax_chunk_bytes, serializer\u001b[38;5;241m=\u001b[39mserializer\n\u001b[1;32m    233\u001b[0m )\n\u001b[0;32m--> 234\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m action, data \u001b[38;5;129;01min\u001b[39;00m actions:\n\u001b[1;32m    235\u001b[0m     ret \u001b[38;5;241m=\u001b[39m chunker\u001b[38;5;241m.\u001b[39mfeed(action, data)\n\u001b[1;32m    236\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ret:\n",
      "Cell \u001b[0;32mIn[5], line 8\u001b[0m, in \u001b[0;36mgenerate_bulk_actions\u001b[0;34m(index_name, data_batch)\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m data_batch:\n\u001b[1;32m      7\u001b[0m     document_id \u001b[38;5;241m=\u001b[39m item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 8\u001b[0m     item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_embeddings\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mget_text_vector\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdescription\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      9\u001b[0m     \u001b[38;5;28;01myield\u001b[39;00m {\n\u001b[1;32m     10\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_index\u001b[39m\u001b[38;5;124m\"\u001b[39m: index_name,\n\u001b[1;32m     11\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_id\u001b[39m\u001b[38;5;124m\"\u001b[39m: document_id,\n\u001b[1;32m     12\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_source\u001b[39m\u001b[38;5;124m\"\u001b[39m: item\n\u001b[1;32m     13\u001b[0m     }\n",
      "Cell \u001b[0;32mIn[2], line 5\u001b[0m, in \u001b[0;36mget_text_vector\u001b[0;34m(sentences)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_text_vector\u001b[39m(sentences):\n\u001b[1;32m      2\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;124;03m    Generates sentence embeddings using pre-trained model 'all-MiniLM-L6-v2'.\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m     model \u001b[38;5;241m=\u001b[39m \u001b[43mSentenceTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msentence-transformers/all-MiniLM-L6-v2\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m     embeddings \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mencode(sentences)\n\u001b[1;32m      7\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:95\u001b[0m, in \u001b[0;36mSentenceTransformer.__init__\u001b[0;34m(self, model_name_or_path, modules, device, cache_folder, use_auth_token)\u001b[0m\n\u001b[1;32m     87\u001b[0m         snapshot_download(model_name_or_path,\n\u001b[1;32m     88\u001b[0m                             cache_dir\u001b[38;5;241m=\u001b[39mcache_folder,\n\u001b[1;32m     89\u001b[0m                             library_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msentence-transformers\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m     90\u001b[0m                             library_version\u001b[38;5;241m=\u001b[39m__version__,\n\u001b[1;32m     91\u001b[0m                             ignore_files\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mflax_model.msgpack\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrust_model.ot\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtf_model.h5\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     92\u001b[0m                             use_auth_token\u001b[38;5;241m=\u001b[39muse_auth_token)\n\u001b[1;32m     94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodules.json\u001b[39m\u001b[38;5;124m'\u001b[39m)):    \u001b[38;5;66;03m#Load as SentenceTransformer model\u001b[39;00m\n\u001b[0;32m---> 95\u001b[0m     modules \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_sbert_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     96\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:   \u001b[38;5;66;03m#Load with AutoModel\u001b[39;00m\n\u001b[1;32m     97\u001b[0m     modules \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_auto_model(model_path)\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/sentence_transformers/SentenceTransformer.py:840\u001b[0m, in \u001b[0;36mSentenceTransformer._load_sbert_model\u001b[0;34m(self, model_path)\u001b[0m\n\u001b[1;32m    838\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module_config \u001b[38;5;129;01min\u001b[39;00m modules_config:\n\u001b[1;32m    839\u001b[0m     module_class \u001b[38;5;241m=\u001b[39m import_from_string(module_config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m--> 840\u001b[0m     module \u001b[38;5;241m=\u001b[39m \u001b[43mmodule_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodule_config\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpath\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    841\u001b[0m     modules[module_config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]] \u001b[38;5;241m=\u001b[39m module\n\u001b[1;32m    843\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:137\u001b[0m, in \u001b[0;36mTransformer.load\u001b[0;34m(input_path)\u001b[0m\n\u001b[1;32m    135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(sbert_config_path) \u001b[38;5;28;01mas\u001b[39;00m fIn:\n\u001b[1;32m    136\u001b[0m     config \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(fIn)\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name_or_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:29\u001b[0m, in \u001b[0;36mTransformer.__init__\u001b[0;34m(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path)\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_lower_case \u001b[38;5;241m=\u001b[39m do_lower_case\n\u001b[1;32m     28\u001b[0m config \u001b[38;5;241m=\u001b[39m AutoConfig\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name_or_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_args, cache_dir\u001b[38;5;241m=\u001b[39mcache_dir)\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     31\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(tokenizer_name_or_path \u001b[38;5;28;01mif\u001b[39;00m tokenizer_name_or_path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m model_name_or_path, cache_dir\u001b[38;5;241m=\u001b[39mcache_dir, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mtokenizer_args)\n\u001b[1;32m     33\u001b[0m \u001b[38;5;66;03m#No max_seq_length set. Try to infer from model\u001b[39;00m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/sentence_transformers/models/Transformer.py:49\u001b[0m, in \u001b[0;36mTransformer._load_model\u001b[0;34m(self, model_name_or_path, config, cache_dir)\u001b[0m\n\u001b[1;32m     47\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_t5_model(model_name_or_path, config, cache_dir)\n\u001b[1;32m     48\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 49\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mauto_model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/auto/auto_factory.py:463\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m    461\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(config) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m    462\u001b[0m     model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n\u001b[0;32m--> 463\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    464\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m    465\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    466\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    467\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnrecognized configuration class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconfig\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for this kind of AutoModel: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    468\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mModel type should be one of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(c\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    469\u001b[0m )\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/modeling_utils.py:2228\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m   2225\u001b[0m     init_contexts\u001b[38;5;241m.\u001b[39mappend(init_empty_weights())\n\u001b[1;32m   2227\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[0;32m-> 2228\u001b[0m     model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m load_in_8bit:\n\u001b[1;32m   2231\u001b[0m     \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbitsandbytes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_keys_to_not_convert, replace_8bit_linear\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:884\u001b[0m, in \u001b[0;36mBertModel.__init__\u001b[0;34m(self, config, add_pooling_layer)\u001b[0m\n\u001b[1;32m    881\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig \u001b[38;5;241m=\u001b[39m config\n\u001b[1;32m    883\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membeddings \u001b[38;5;241m=\u001b[39m BertEmbeddings(config)\n\u001b[0;32m--> 884\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder \u001b[38;5;241m=\u001b[39m \u001b[43mBertEncoder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    886\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;241m=\u001b[39m BertPooler(config) \u001b[38;5;28;01mif\u001b[39;00m add_pooling_layer \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    888\u001b[0m \u001b[38;5;66;03m# Initialize weights and apply final processing\u001b[39;00m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552\u001b[0m, in \u001b[0;36mBertEncoder.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m    550\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m()\n\u001b[1;32m    551\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig \u001b[38;5;241m=\u001b[39m config\n\u001b[0;32m--> 552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayer \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList([BertLayer(config) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(config\u001b[38;5;241m.\u001b[39mnum_hidden_layers)])\n\u001b[1;32m    553\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:552\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    550\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m()\n\u001b[1;32m    551\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig \u001b[38;5;241m=\u001b[39m config\n\u001b[0;32m--> 552\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayer \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mModuleList([\u001b[43mBertLayer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(config\u001b[38;5;241m.\u001b[39mnum_hidden_layers)])\n\u001b[1;32m    553\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_checkpointing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:474\u001b[0m, in \u001b[0;36mBertLayer.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m    472\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m should be used as a decoder model if cross attention is added\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    473\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcrossattention \u001b[38;5;241m=\u001b[39m BertAttention(config, position_embedding_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mabsolute\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 474\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mintermediate \u001b[38;5;241m=\u001b[39m \u001b[43mBertIntermediate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    475\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput \u001b[38;5;241m=\u001b[39m BertOutput(config)\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/transformers/models/bert/modeling_bert.py:436\u001b[0m, in \u001b[0;36mBertIntermediate.__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m    434\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, config):\n\u001b[1;32m    435\u001b[0m     \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m()\n\u001b[0;32m--> 436\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdense \u001b[38;5;241m=\u001b[39m \u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mLinear\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhidden_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mintermediate_size\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    437\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config\u001b[38;5;241m.\u001b[39mhidden_act, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    438\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mintermediate_act_fn \u001b[38;5;241m=\u001b[39m ACT2FN[config\u001b[38;5;241m.\u001b[39mhidden_act]\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/torch/nn/modules/linear.py:104\u001b[0m, in \u001b[0;36mLinear.__init__\u001b[0;34m(self, in_features, out_features, bias, device, dtype)\u001b[0m\n\u001b[1;32m    102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    103\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregister_parameter(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbias\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreset_parameters\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/torch/nn/modules/linear.py:110\u001b[0m, in \u001b[0;36mLinear.reset_parameters\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mreset_parameters\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    107\u001b[0m     \u001b[38;5;66;03m# Setting a=sqrt(5) in kaiming_uniform is the same as initializing with\u001b[39;00m\n\u001b[1;32m    108\u001b[0m     \u001b[38;5;66;03m# uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see\u001b[39;00m\n\u001b[1;32m    109\u001b[0m     \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/issues/57109\u001b[39;00m\n\u001b[0;32m--> 110\u001b[0m     \u001b[43minit\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkaiming_uniform_\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43ma\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msqrt\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    111\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    112\u001b[0m         fan_in, _ \u001b[38;5;241m=\u001b[39m init\u001b[38;5;241m.\u001b[39m_calculate_fan_in_and_fan_out(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mweight)\n",
      "File \u001b[0;32m/Library/Python/3.9/site-packages/torch/nn/init.py:460\u001b[0m, in \u001b[0;36mkaiming_uniform_\u001b[0;34m(tensor, a, mode, nonlinearity, generator)\u001b[0m\n\u001b[1;32m    458\u001b[0m bound \u001b[38;5;241m=\u001b[39m math\u001b[38;5;241m.\u001b[39msqrt(\u001b[38;5;241m3.0\u001b[39m) \u001b[38;5;241m*\u001b[39m std  \u001b[38;5;66;03m# Calculate uniform bounds from standard deviation\u001b[39;00m\n\u001b[1;32m    459\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 460\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtensor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muniform_\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43mbound\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbound\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "index_data_in_batches(\n",
    "    \"../files/dataset/products.json\", \"products-catalog-2\", batch_size=100\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
